In [ ]:
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from glob import glob
import IPython.display as ipd
from tqdm.notebook import tqdm
import subprocess
import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence
from textblob import TextBlob
import IPython.display as ipd
import os
import nltk
import seaborn as sns
from pydub import AudioSegment
from pydub.effects import normalize, low_pass_filter, high_pass_filter
from vosk import Model, KaldiRecognizer
import wave
import json
加载视频¶
In [ ]:
input_file = 'Experimenter_CREW_999_1_All_1731617801.mp4'
In [ ]:
ipd.Video(input_file,width= 700)
Out[ ]:
打开视频并读取元数据¶
In [ ]:
# 加载视频采集
cap = cv2.VideoCapture(input_file)
In [ ]:
# 帧总数 frames
noFrames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
# 视频高度和宽度
framesHeight = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # Cap prop frame height
framesWidth = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # Cap prop frame width
# 获取每秒帧数
framesSecond = cap.get(cv2.CAP_PROP_FPS)
print("Total number of frames in the video is",noFrames)
print("Frame Height in the video is",noFrames)
print("Frame Width in the video is",noFrames)
print("Frame per second is",framesSecond)
Total number of frames in the video is 10691.0 Frame Height in the video is 10691.0 Frame Width in the video is 10691.0 Frame per second is 29.964424428092247
In [ ]:
# 一旦完成工作,就释放视频,使 Python 不再使用视频
cap.release()
从视频中提取图像¶
In [ ]:
cap = cv2.VideoCapture(input_file)
In [ ]:
ret, img = cap.read()
print(f'Returned {ret} and img of shape {img.shape} ')
Returned True and img of shape (2160, 3840, 3)
In [ ]:
# 绘制 opencv 图像的辅助函数
def display_cv2_img(img, figsize = (10,10)):
img_ = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
fig, ax = plt.subplots(figsize = figsize)
ax.imshow(img_)
ax.axis("off")
In [ ]:
display_cv2_img(img)
In [ ]:
cap.release()
显示视频中的多个帧¶
In [ ]:
# 创建一个 5x5 的网格,整个图像区域为 30x20 英寸
fig, axs = plt.subplots(5, 5, figsize=(30, 20))
# 将返回的二维 axs 数组转换为一维数组,可以像访问列表一样方便地访问每个子图
axs = axs.flatten()
# 读取视频文件
cap = cv2.VideoCapture(input_file) # 打开指定路径的 input_file 视频文件
n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) # 获取视频的总帧数,并将其转换为整数。
img_idx = 0 # 初始化索引 img_idx,选择不同的子图区域进行绘制
for frame in range(n_frames): # 迭代视频的每一帧
ret, img = cap.read() # 读取视频的当前帧, img 为该帧的图像数据
if not ret: # 如果成功读取,ret 为 True
break
if frame % 100 == 0: # 每 100 帧读取一次。这样只处理视频中的一部分帧,而不是每一帧,节省内存和计算资源
if img_idx < len(axs): # 确保 img_idx 不超过 25
axs[img_idx].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) # 使用 OpenCV 读取的图像默认是 BGR 格式,而 Matplotlib 显示图像时使用 RGB 格式,用 cv2.cvtColor 将图像从 BGR 转换为 RGB。然后在子图区域中显示这帧图像。
axs[img_idx].set_title(f'Frame: {frame}') # 设置当前子图的标题,显示视频帧的索引
axs[img_idx].axis('off') # 关闭子图的坐标轴,以便仅显示图像
img_idx += 1 # 更新子图索引,确保下次图像显示在下一个子图中
plt.tight_layout() # 调整子图之间的间距,使得子图不会重叠
plt.show() # 显示绘制的所有图像
cap.release() # 释放视频捕获对象 cap,关闭视频文件并释放相关资源
提取音频(wav 格式)¶
In [ ]:
# 音频转换为 单声道(1 声道)、16kHz 采样率、线性 PCM 16-bit,符合 API 要求
audio_output = "audio.wav" # 音频输出文件路径, 输出保存为 audio.wav
#subprocess.run(["ffmpeg", "-i", input_file, "-q:a", "0", "-map", "a", audio_output])
subprocess.run( # 通过 Python 的 subprocess 模块调用外部命令(在这里是 ffmpeg 命令)。subprocess.run() 运行后会等待命令执行完成。
[ # 传递给 ffmpeg 的命令和参数。
"ffmpeg", # 调用 ffmpeg 工具来处理音频文件
"-i", input_file, # 指定输入文件,input_file 是音频/视频文件的路径,ffmpeg 会从中提取音频
"-ac", "1", # 设置音频的声道数为 1,即将音频转换为单声道(Mono), 符合 API 要求的单声道音频。
"-ar", "16000", # 设置音频的采样率为 16 kHz(16,000 Hz)。常用的语音采样率,符合音频处理或语音识别 API 的要求
"-acodec", "pcm_s16le", # 指定音频编码格式为 pcm_s16le,即线性 PCM 编码,16-bit 小端格式(未压缩的音频格式,常用于高质量音频)
audio_output # 输出文件路径,转换后的音频将保存到这个路径(即 audio.wav)
]
)
print(f"音频已保存为 {audio_output}")
ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
built with Apple clang version 16.0.0 (clang-1600.0.26.6)
configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --disable-indev=jack --enable-videotoolbox --enable-audiotoolbox --enable-neon
libavutil 59. 39.100 / 59. 39.100
libavcodec 61. 19.101 / 61. 19.101
libavformat 61. 7.100 / 61. 7.100
libavdevice 61. 3.100 / 61. 3.100
libavfilter 10. 4.100 / 10. 4.100
libswscale 8. 3.100 / 8. 3.100
libswresample 5. 3.100 / 5. 3.100
libpostproc 58. 3.100 / 58. 3.100
Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'Experimenter_CREW_999_1_All_1731617801.mp4':
Metadata:
major_brand : mp42
minor_version : 1
compatible_brands: isommp41mp42
creation_time : 2025-03-04T22:42:51.000000Z
Duration: 00:05:57.77, start: 0.000000, bitrate: 4031 kb/s
Stream #0:0[0x1](eng): Video: hevc (Main) (hvc1 / 0x31637668), yuvj420p(pc), 3840x2160 [SAR 1:1 DAR 16:9], 3880 kb/s, 29.96 fps, 29.97 tbr, 30k tbn (default)
Metadata:
creation_time : 2025-03-04T22:42:51.000000Z
handler_name : Core Media Video
vendor_id : [0][0][0][0]
encoder : HEVC Coding
Stream #0:1[0x2](eng): Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, stereo, fltp, 157 kb/s (default)
Metadata:
creation_time : 2025-03-04T22:42:51.000000Z
handler_name : Core Media Audio
vendor_id : [0][0][0][0]
Stream mapping:
Stream #0:1 -> #0:0 (aac (native) -> pcm_s16le (native))
Press [q] to stop, [?] for help
Output #0, wav, to 'audio.wav':
Metadata:
major_brand : mp42
minor_version : 1
compatible_brands: isommp41mp42
ISFT : Lavf61.7.100
Stream #0:0(eng): Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, mono, s16, 256 kb/s (default)
Metadata:
creation_time : 2025-03-04T22:42:51.000000Z
handler_name : Core Media Audio
vendor_id : [0][0][0][0]
encoder : Lavc61.19.101 pcm_s16le
音频已保存为 audio.wav
[out#0/wav @ 0x14c10cc20] video:0KiB audio:11180KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.000681% size= 11180KiB time=00:05:57.77 bitrate= 256.0kbits/s speed=1.83e+03x
In [ ]:
print(os.path.exists("audio.wav")) # 确保文件存在
print(os.path.getsize("audio.wav")) # 确保文件大小不是 0
ipd.Audio("audio.wav")
True 11448816
Out[ ]: